1-QC-LiverDisease
2025-07-21
1 Executive Summary
| Study | Phase 1b Clinical Trial (NCT03299946) |
| Intervention | cabozantinib + nivolumab |
| Sponsor/Collab | John Hopkins + Exelixis |
| Disease | hepatocellular carcinoma (HCC) |
| Tissue | Liver |
| Spatial Data Accession | GSE238264 |
| Sample Numbers | 15 patients. (5 achieved pathologic response) |
| Platform | 10x Genomics Visium spatial transcriptomics |
What is Cabozantinib
Click for Details
- An oral targeted therapy
- Works by acting as a multi-tyrosine kinase inhibitor targeting VEGFR2, c-Met, RET, TIE2, and AXL involved in angiogenesis and tumor growth.
- By inhibiting these pathways, it starves tumors of blood supply and blocks metastatic signaling.
What is Nivolumab
Click for Details
- A monoclonal Antibody.
- Blocks the PD-1 immune checkpoint receptor on T cells.
- By Inhibiting the PD-1/PD-L1 interaction, it prevents cancer cells from turning off the immune response, essentially removing the “brakes” on T cell activation
- Allows the patient’s own immune system to recognize and attack tumor cells.
Code
# Core analysis packages
library(Seurat)
library(SeuratObject)
library(ggplot2)
library(dplyr)
library(tidyr)
library(data.table)
# Spatial transcriptomics specific
library(spatstat)
library(Matrix)
library(rjson)
library(cowplot)
library(RColorBrewer)
library(viridis)
library(DT)
library(hdf5r)
# Single cell integration
library(harmony)
library(clustree)
library(scran)
library(scater)
# Drug discovery and pathway analysis
library(clusterProfiler)
library(org.Hs.eg.db)
library(ReactomePA)
library(GSVA)
library(msigdbr)
library(enrichplot)
# Cell-cell communication
library(CellChat)
library(nichenetr)
# Statistical analysis
library(ComplexHeatmap)
library(circlize)
library(corrplot)
library(survival)
library(survminer)
# Biomarker discovery
library(randomForest)
library(caret)
library(pROC)
library(ROCR)
# Set theme for consistent plotting
theme_set(
theme_bw() + theme(
plot.title = element_text(
size = 14
, face = "bold"
)
, axis.title = element_text(size = 12)
, axis.text = element_text(size = 10)
, legend.text = element_text(size = 10)
)
)
# Set random seed for reproducibility
set.seed(42)Code
# Create directory structure
workDir <- "G:/SpatialOmics/Tx_Drug_Discovery"
dirs <- c(
"data/raw/spatial"
, "data/raw/single_cell"
, "data/processed"
, "results/figures"
, "results/tables"
, "results/biomarkers"
, "results/drug_targets"
)
sapply(dirs, function(dir) {
full_path <- file.path(workDir, dir)
if (!dir.exists(full_path)) {
dir.create(
path = full_path
, recursive = TRUE
)
}
})2 Spatial Transcriptomics Data (GSE238264)
2.1 ST - Get Metadata
Code
# Function to download and load spatial transcriptomics data
load_spatial_metadata <- function(
geo_accession = "some accession"
, save_dir = "path/to/save/dir"
) {
message(
"Loading spatial transcriptomics data from "
, geo_accession
)
# Load required packages
if (!require("GEOquery", quietly = TRUE)) {
BiocManager::install("GEOquery")
library(GEOquery)
}
message("Downloading spatial transcriptomics data from ", geo_accession)
# Download GEO data
gse <- getGEO(geo_accession, GSEMatrix = TRUE, AnnotGPL = FALSE)
# Extract expression data and sample metadata
if (length(gse) > 1) {
# Multiple platforms - take the first one or specify which one you want
eset <- gse[[1]]
} else {
eset <- gse[[1]]
}
# Extract expression matrix
expression_data <- exprs(eset)
# Extract sample metadata
sample_metadata <- pData(eset)
# Save the raw data
write.csv(expression_data,
file = file.path(save_dir, "expression_matrix.csv"))
write.csv(sample_metadata,
file = file.path(save_dir, "sample_metadata.csv"),
row.names = FALSE)
message("Downloaded data for ", ncol(expression_data), " samples")
message("Expression data: ", nrow(expression_data), " features")
return(list(
expression_data = expression_data,
sample_metadata = sample_metadata
))
return(sample_metadata)
}
# Load the sample metadata
raw_spatial_data <- load_spatial_metadata(
geo_accession = "GSE238264"
, save_dir = file.path(workDir, "data/raw/spatial" )
)
# TODO: Manual Clean up
sample_metadata<- raw_spatial_data$sample_metadata %>%
# Delete columns with unchanging values
dplyr::select(where(~ length(unique(.x)) > 1)) %>%
dplyr::select(geo=geo_accession, response=`phenotype:ch1`, sample_id=supplementary_file_1) %>%
mutate( response = ifelse(response %in% "responder", "Responder", "Non_Responder")
, sample_id = gsub(".*_|.tar.gz", "", sample_id)
# Here I make up some metadata for analysis purposes
, pathologic_response = ifelse(response %in% "Responder", "Major", "None")
, cabozantinib_dose = rep("60mg_daily", 7)
, nivolumab_dose = rep("240mg_Q2W", 7)
, treatment_duration = rep("8_weeks", 7)
, tumor_size_baseline = c(8.2, 6.5, 12.1, 9.8, 7.3, 11.2, 13.5)
, tumor_size_post = c(1.2, 1.8, 2.1, 2.3, 9.8, 12.1, 8.2)
, age = c(62, 58, 71, 66, 59, 73, 68)
, etiology = c("HBV", "HCV", "NASH", "HBV", "Alcohol", "HCV", "HBV")
)
head(sample_metadata %>% dplyr::select(sample_id, response, tumor_size_baseline, tumor_size_post, age, etiology))## sample_id response tumor_size_baseline tumor_size_post age etiology
## GSM7661255 HCC1R Responder 8.2 1.2 62 HBV
## GSM7661256 HCC2R Responder 6.5 1.8 58 HCV
## GSM7661257 HCC3R Responder 12.1 2.1 71 NASH
## GSM7661258 HCC4R Responder 9.8 2.3 66 HBV
## GSM7661259 HCC5NR Non_Responder 7.3 9.8 59 Alcohol
## GSM7661260 HCC6NR Non_Responder 11.2 12.1 73 HCV
Code
fetch_eset <- function(geo_accession="someAccession"
, save_dir = "path/to/save/dir") {
message("Downloading HCC scRNA-seq reference data from ", geo_accession)
# Download GEO metadata
gse <- (getGEO(geo_accession, GSEMatrix = TRUE, AnnotGPL = FALSE))
if (length(gse) > 1) {
eset <- gse[[1]] # Take first platform
} else {
eset <- gse[[1]]
}
#Get expression data
options(timeout = 400) # Increase timeout
getGEOSuppFiles(geo_accession, makeDirectory = TRUE, baseDir = save_dir )
}2.2 ST - Get Spatial Data
Code
#~~~~~~~~~~~~~~~~~~~
# Load Esets
#~~~~~~~~~~~~~~~~~~
rawfiles <- file.path(workDir,"data/raw/spatial/GSE238264")
sample_paths <- list.files(rawfiles, full.names = T)
sample_names <- basename(sample_paths)
# Function to load multiple 10x Visium samples
load_multiple_spatial_samples <- function(sample_paths, sample_names) {
spatial_list <- list()
for(i in 1:length(sample_paths)) {
cat("Loading", sample_names[i], "...\n")
# Load 10x Visium data
spatial_data <- Load10X_Spatial(
data.dir = sample_paths[i],
filename = "filtered_feature_bc_matrix.h5", # or matrix.mtx.gz
assay = "Spatial",
slice = sample_names[i]
)
# Add sample metadata
spatial_data$sample_id <- sample_names[i]
#spatial_data$patient_id <- sample_names[i] # Adjust based on your naming
spatial_list[[sample_names[i]]] <- spatial_data
}
return(spatial_list)
}
# Load all samples
spatial_samples <- load_multiple_spatial_samples(
sample_paths = sample_paths,
sample_names = sample_names
)## Loading HCC1R ...
## Loading HCC2R ...
## Loading HCC3R ...
## Loading HCC4R ...
## Loading HCC5NR ...
## Loading HCC6NR ...
## Loading HCC7NR ...
2.3 ST - Quality Control
Code
# Function to calculate QC metrics
calculate_spatial_qc <- function(spatial_obj) {
# Basic metrics
spatial_obj$nCount_Spatial <- colSums(spatial_obj@assays$Spatial$counts)
spatial_obj$nFeature_Spatial <- colSums(spatial_obj@assays$Spatial$counts > 0)
# Mitochondrial genes
spatial_obj$percent.mt <- PercentageFeatureSet(spatial_obj, pattern = "^MT-|^mt")
# Ribosomal genes
spatial_obj$percent.ribo <- PercentageFeatureSet(spatial_obj, pattern = "^RP[SL]|rp[sl]")
# Hemoglobin genes (contamination)
spatial_obj$percent.hb <- PercentageFeatureSet(spatial_obj, pattern = "^HB[^(P)]|hb[^(p)]")
# Log10 transformation for better visualization
spatial_obj$log10_nCount <- log10(spatial_obj$nCount_Spatial + 1)
spatial_obj$log10_nFeature <- log10(spatial_obj$nFeature_Spatial + 1)
return(spatial_obj)
}
# Apply QC to all samples
spatial_samples <- lapply(spatial_samples, calculate_spatial_qc)2.3.1 Visualise QC Metrics
Code
# Calculate QC Metrics
theme_spatial <- theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
axis.text.y = element_text(size = 8),
axis.title = element_text(size = 10),
plot.title = element_text(size = 11),
legend.text = element_text(size = 8),
strip.text = element_text(size = 9)
)
# Function for comprehensive QC plots
plot_spatial_qc <- function(spatial_list, sample_metadata) {
library(ggplot2)
library(patchwork)
# Extract QC metrics from all samples
qc_data <- data.frame()
for(sample_name in names(spatial_list)) {
obj <- spatial_list[[sample_name]]
sample_qc <- data.frame(
sample_id = sample_name,
nCount_Spatial = obj$nCount_Spatial,
nFeature_Spatial = obj$nFeature_Spatial,
percent.mt = obj$percent.mt,
percent.ribo = obj$percent.ribo,
log10_nCount = obj$log10_nCount,
log10_nFeature = obj$log10_nFeature
)
qc_data <- rbind(qc_data, sample_qc)
}
# Merge with response metadata
qc_data <- merge(qc_data, sample_metadata, by = "sample_id")
# Calculate correlations for annotation
correlations <- qc_data %>%
group_by(sample_id, response) %>%
summarise(
correlation = cor(nCount_Spatial, nFeature_Spatial),
.groups = 'drop'
)
# 1. Violin plots for key metrics
p1 <- ggplot(qc_data, aes(x = sample_id, y = nCount_Spatial, fill = response)) +
geom_violin() + geom_boxplot(width = 0.1) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "UMI Count per Spot", y = "nCount_Spatial") + theme_spatial
p2 <- ggplot(qc_data, aes(x = sample_id, y = nFeature_Spatial, fill = response)) +
geom_violin() + geom_boxplot(width = 0.1) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Genes per Spot", y = "nFeature_Spatial") + theme_spatial
p3 <- ggplot(qc_data, aes(x = sample_id, y = percent.mt, fill = response)) +
geom_violin() + geom_boxplot(width = 0.1) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Mitochondrial %", y = "percent.mt") + theme_spatial
# 2. Scatter plot with correlation coefficients
p4 <- ggplot(qc_data, aes(x = nCount_Spatial, y = nFeature_Spatial, color = response)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = TRUE, alpha = 0.3) + # Add trend lines
facet_wrap(~sample_id, scales = "free") +
geom_text(
data = correlations,
aes(label = paste("r =", round(correlation, 3))),
x = Inf, y = Inf, hjust = 1.1, vjust = 1.1,
color = "black", size = 3, inherit.aes = FALSE
) +
labs(title = "UMI vs Genes Correlation per Sample") +
theme_spatial
p5 <- ggplot(qc_data, aes(x = nCount_Spatial, y = percent.mt, color = response)) +
geom_point(alpha = 0.6) +
facet_wrap(~sample_id, scales = "free") +
labs(title = "UMI vs Mitochondrial %") + theme_spatial
# 3. Summary statistics
summary_stats <- qc_data %>%
group_by(sample_id, response) %>%
summarise(
median_UMI = median(nCount_Spatial),
mean_UMI = mean(nCount_Spatial),
median_genes = median(nFeature_Spatial),
median_mt = median(percent.mt),
n_spots = n(),
.groups = 'drop'
)
p6 <- ggplot(summary_stats, aes(x = sample_id, y = mean_UMI, fill = response)) +
geom_col() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Mean UMI per Sample", y = "Mean UMI Count") + theme_spatial
p7 <- ggplot(summary_stats, aes(x = sample_id, y = n_spots, fill = response)) +
geom_col() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Number of Spots per Sample", y = "Number of Spots") + theme_spatial
# Combine plots
combined_plot1 <- p1
combined_plot2 <- p2
combined_plot3 <- p3
combined_plot4 <- (p6 | p7)
combined_plot5 <- (p4)
combined_plot6 <- (p5)
return(list(
combined_plot1 = combined_plot1,
combined_plot2 = combined_plot2,
combined_plot3 = combined_plot3,
combined_plot4 = combined_plot4,
combined_plot5 = combined_plot6,
qc_data = qc_data,
summary_stats = summary_stats
))
}
# Generate QC plots
qc_results <- plot_spatial_qc(spatial_samples, sample_metadata)
print(qc_results$combined_plot1)## NULL
Code
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Spatial Distribution of QC Metrics
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Plot QC metrics overlaid on tissue
plot_spatial_qc_overlay <- function(spatial_obj, sample_name) {
p1 <- SpatialFeaturePlot(spatial_obj, features = "nCount_Spatial") +
ggtitle(paste(sample_name, "- UMI Count")) + theme_spatial
p2 <- SpatialFeaturePlot(spatial_obj, features = "nFeature_Spatial") +
ggtitle(paste(sample_name, "- Gene Count")) + theme_spatial
p3 <- SpatialFeaturePlot(spatial_obj, features = "percent.mt") +
ggtitle(paste(sample_name, "- Mitochondrial %")) + theme_spatial
combined <- (p1 | p2) / p3
return(combined)
}
# Plot for each sample
for(sample_name in names(spatial_samples)) {
cat("Plotting spatial QC for", sample_name, "\n")
p <- plot_spatial_qc_overlay(spatial_samples[[sample_name]], sample_name)
print(p)
}## Plotting spatial QC for HCC1R
## Plotting spatial QC for HCC2R
## Plotting spatial QC for HCC3R
## Plotting spatial QC for HCC4R
## Plotting spatial QC for HCC5NR
## Plotting spatial QC for HCC6NR
## Plotting spatial QC for HCC7NR
VERDICT: Remove HCC2R, as it seems have been very poorly sequences. Both Library Size and Genes detected is low in this sample. Most expression occur in edges which is a red flag.
2.3.2 Low Feature Spots
Code
sapply(
names(spatial_samples)
, function(dataset_name) {
seurat_obj <- spatial_samples[[dataset_name]]
# Sum up number of spots where a gene has non-zero count
gene_detection_rates <- Matrix::rowSums(
GetAssayData(
seurat_obj
, slot = "counts"
) > 0
)
data.frame(
`**Genes detected in >50% of spots:**` = sum(gene_detection_rates > ncol(seurat_obj)/2)
, `**Genes detected in >10% of spots:**` = sum(gene_detection_rates > ncol(seurat_obj)/10)
, `**Genes detected in <5% of spots:**` = sum(gene_detection_rates < ncol(seurat_obj)/20)
, check.names = FALSE
) %>%
t() %>%
knitr::kable(caption=dataset_name) %>%
print()
cat("\n")
}
, simplify = FALSE
)##
##
## Table: (\#tab:unnamed-chunk-3)HCC1R
##
## | | |
## |:------------------------------------|-----:|
## |**Genes detected in >50% of spots:** | 461|
## |**Genes detected in >10% of spots:** | 4165|
## |**Genes detected in <5% of spots:** | 29494|
##
##
##
## Table: (\#tab:unnamed-chunk-3)HCC2R
##
## | | |
## |:------------------------------------|-----:|
## |**Genes detected in >50% of spots:** | 69|
## |**Genes detected in >10% of spots:** | 889|
## |**Genes detected in <5% of spots:** | 34221|
##
##
##
## Table: (\#tab:unnamed-chunk-3)HCC3R
##
## | | |
## |:------------------------------------|-----:|
## |**Genes detected in >50% of spots:** | 419|
## |**Genes detected in >10% of spots:** | 4742|
## |**Genes detected in <5% of spots:** | 28874|
##
##
##
## Table: (\#tab:unnamed-chunk-3)HCC4R
##
## | | |
## |:------------------------------------|-----:|
## |**Genes detected in >50% of spots:** | 2742|
## |**Genes detected in >10% of spots:** | 10701|
## |**Genes detected in <5% of spots:** | 24131|
##
##
##
## Table: (\#tab:unnamed-chunk-3)HCC5NR
##
## | | |
## |:------------------------------------|-----:|
## |**Genes detected in >50% of spots:** | 827|
## |**Genes detected in >10% of spots:** | 4660|
## |**Genes detected in <5% of spots:** | 29447|
##
##
##
## Table: (\#tab:unnamed-chunk-3)HCC6NR
##
## | | |
## |:------------------------------------|-----:|
## |**Genes detected in >50% of spots:** | 3289|
## |**Genes detected in >10% of spots:** | 10215|
## |**Genes detected in <5% of spots:** | 24651|
##
##
##
## Table: (\#tab:unnamed-chunk-3)HCC7NR
##
## | | |
## |:------------------------------------|-----:|
## |**Genes detected in >50% of spots:** | 1040|
## |**Genes detected in >10% of spots:** | 6003|
## |**Genes detected in <5% of spots:** | 27880|
## $HCC1R
## NULL
##
## $HCC2R
## NULL
##
## $HCC3R
## NULL
##
## $HCC4R
## NULL
##
## $HCC5NR
## NULL
##
## $HCC6NR
## NULL
##
## $HCC7NR
## NULL
2.3.3 Outlier Detection
Code
outliers_df <- data.frame()
sapply(
names(spatial_samples)
, function(dataset_name) {
seurat_obj <- spatial_samples[[dataset_name]]
# UMI outliers using 1.5 * IQR rule
Q1_umi <- quantile(seurat_obj@meta.data$nCount_Spatial, 0.25)
Q3_umi <- quantile(seurat_obj@meta.data$nCount_Spatial, 0.75)
IQR_umi <- Q3_umi - Q1_umi
umi_outliers <- sum(seurat_obj@meta.data$nCount_Spatial < (Q1_umi - 1.5*IQR_umi) |
seurat_obj@meta.data$nCount_Spatial > (Q3_umi + 1.5*IQR_umi))
# Feature outliers
Q1_feat <- quantile(seurat_obj@meta.data$nFeature_Spatial, 0.25)
Q3_feat <- quantile(seurat_obj@meta.data$nFeature_Spatial, 0.75)
IQR_feat <- Q3_feat - Q1_feat
feat_outliers <- sum(seurat_obj@meta.data$nFeature_Spatial < (Q1_feat - 1.5*IQR_feat) |
seurat_obj@meta.data$nFeature_Spatial > (Q3_feat + 1.5*IQR_feat))
tmp <- data.frame(
# UMI outliers (1.5×IQR rule)
umi_outliers =paste(umi_outliers, "spots (", round(100*umi_outliers/ncol(seurat_obj), 1), "%)")
# Feature outliers (1.5×IQR rule)
, feat_outliers= paste(feat_outliers, "spots (", round(100*feat_outliers/ncol(seurat_obj), 1), "%)")
)
rbind(tmp,outliers_df )
}
)## HCC1R HCC2R HCC3R HCC4R HCC5NR HCC6NR HCC7NR
## umi_outliers "240 spots ( 8 %)" "379 spots ( 13.7 %)" "15 spots ( 0.7 %)" "0 spots ( 0 %)" "23 spots ( 1.7 %)" "128 spots ( 5 %)" "60 spots ( 2.4 %)"
## feat_outliers "127 spots ( 4.2 %)" "225 spots ( 8.1 %)" "0 spots ( 0 %)" "0 spots ( 0 %)" "61 spots ( 4.6 %)" "124 spots ( 4.8 %)" "28 spots ( 1.1 %)"
VERDICT: Only 69 house keeping genes (>50% spot) in HCC2R, marked for removal.
2.3.4 Predict Gender
Code
# Define sex-specific marker genes
sex_markers <- list(
# Y chromosome genes (male-specific)
Y_genes = c("RPS4Y1", "EIF1AY", "KDM5D", "UTY", "USP9Y", "DDX3Y", "TMSB4Y"),
# X chromosome markers
X_genes = c("XIST"), # X-inactivation gene (female-specific)
# Additional sex-linked genes
sex_genes = c("SRY", "ZFY", "AMELX", "AMELY")
)
# Function to predict gender from spatial data
predict_gender_spatial <- function(spatial_obj, sex_markers) {
# Get expression data
expr_data <- GetAssayData(spatial_obj, slot = "counts")
# Calculate Y gene expression score
Y_genes_present <- intersect(sex_markers$Y_genes, rownames(expr_data))
if(length(Y_genes_present) > 0) {
Y_score <- colMeans(expr_data[Y_genes_present, , drop = FALSE])
} else {
Y_score <- rep(0, ncol(expr_data))
}
# Calculate XIST expression (female marker)
if("XIST" %in% rownames(expr_data)) {
XIST_score <- expr_data["XIST", ]
} else {
XIST_score <- rep(0, ncol(expr_data))
}
# Create prediction scores
spatial_obj$Y_gene_score <- Y_score
spatial_obj$XIST_score <- XIST_score
# Predict gender based on thresholds
spatial_obj$predicted_gender <- ifelse(
Y_score > 0.1, "Male",
ifelse(XIST_score > 0.2, "Female", "Uncertain")
)
return(spatial_obj)
}
# Apply to all samples
spatial_samples_gender <- lapply(spatial_samples, function(obj) {
predict_gender_spatial(obj, sex_markers)
})
# Extract gender predictions from all samples into one dataframe
compile_gender_predictions <- function(spatial_samples_gender) {
results_df <- data.frame()
for(sample_name in names(spatial_samples_gender)) {
# Get the table of predictions for this sample
gender_table <- table(spatial_samples_gender[[sample_name]]$predicted_gender)
# Convert to dataframe with sample info
sample_df <- data.frame(
sample_id = sample_name,
gender = names(gender_table),
count = as.numeric(gender_table),
proportion = round(as.numeric(gender_table) / sum(gender_table), 3)
)
results_df <- rbind(results_df, sample_df)
}
return(results_df)
}
genderpred <- compile_gender_predictions(spatial_samples_gender) %>%
filter(!gender %in% "Uncertain") %>%
group_by(sample_id) %>%
slice_max(order_by = count, n = 1) %>%
ungroup() %>%
dplyr::select(sample_id, gender)
# Assign gender to gender column
for(i in 1:nrow(genderpred)) {
sample_name <- genderpred$sample_id[i]
if(sample_name %in% names(spatial_samples)) {
spatial_samples[[sample_name]]$gender <- genderpred$gender[i]
}
}
genderpred %>% knitr::kable()| sample_id | gender |
|---|---|
| HCC1R | Male |
| HCC2R | Male |
| HCC3R | Female |
| HCC4R | Male |
| HCC5NR | Female |
| HCC6NR | Female |
| HCC7NR | Female |
2.3.5 Filtering
| min_nCount_Spatial | 500 |
| min_nFeature_Spatial | 200 |
| max_nFeature_Spatial | 8000 |
| max_percent.mt | 20 |
Code
min_nCount_Spatial=500
min_nFeature_Spatial=200
max_nFeature_Spatial=6000
max_percent.mt=20
#~~~~~~~~~~~~~~~~~~~#
# Apply Filter
#~~~~~~~~~~~~~~~~~~~#
# Apply consistent filtering across all samples
filter_spatial_samples <- function(spatial_list, thresholds) {
filtered_samples <- list()
for(sample_name in names(spatial_list)) {
obj <- spatial_list[[sample_name]]
cat("Filtering", sample_name, "...\n")
cat(" Before:", ncol(obj), "spots\n")
# Apply filters
obj <- subset(obj,
subset = nCount_Spatial >= min_nCount_Spatial &
nFeature_Spatial >= min_nFeature_Spatial &
nFeature_Spatial <= max_nFeature_Spatial &
percent.mt <= max_percent.mt)
cat(" After:", ncol(obj), "spots\n")
filtered_samples[[sample_name]] <- obj
}
return(filtered_samples)
}
# Apply filtering
spatial_samples_filt <- filter_spatial_samples(spatial_samples, filtering_thresholds)## Filtering HCC1R ...
## Before: 3006 spots
## After: 2577 spots
## Filtering HCC2R ...
## Before: 2766 spots
## After: 1207 spots
## Filtering HCC3R ...
## Before: 2170 spots
## After: 1802 spots
## Filtering HCC4R ...
## Before: 3002 spots
## After: 1974 spots
## Filtering HCC5NR ...
## Before: 1320 spots
## After: 1248 spots
## Filtering HCC6NR ...
## Before: 2575 spots
## After: 2338 spots
## Filtering HCC7NR ...
## Before: 2453 spots
## After: 2450 spots
Note: 1 Sample (HCC2R) Removed
2.4 ST - Pre Processing
Code
if (!file.exists(file.path(workDir, "/data/processed", "spatial_samples_sct.RDS"))) {
# Function to apply SCTransform to all slices
apply_sctransform_to_all_slices <- function(seurat_obj, assay_of_interest = "Spatial") {
# Get all slice names
slice_names <- names(seurat_obj@images)
message("Found ", length(slice_names), " slices:", paste(slice_names, collapse = ", "), "\n")
# Apply SCTransform to the entire object (works on all slices)
message("Applying SCTransform normalization...\n")
seurat_obj <- SCTransform(
seurat_obj # Seurat object
, assay = assay_of_interest # input assay name
, method = "poisson" # use poisson model for spatial data
, verbose = FALSE # reduce output
)
message("SCTransform normalization completed successfully!\n")
message("New assay 'SCT' has been added to the Seurat object.\n")
# Set default assay to SCT for downstream analysis
DefaultAssay(seurat_obj) <- "SCT"
return(seurat_obj)
}
spatial_samples_sct <- sapply( spatial_samples_filt, apply_sctransform_to_all_slices)
saveRDS(spatial_samples_sct, file.path(workDir, "/data/processed", "spatial_samples_sct.RDS"))
} else {
spatial_samples_sct <- readRDS(file.path(workDir, "/data/processed", "spatial_samples_sct.RDS"))
}2.5 ST - Dim reduction
Code
if (!file.exists(file.path(workDir, "/data/processed", "spatial_samples_sct_dimr.RDS"))) {
dimreduce <- function(seurat_obj, assay_of_interest = "Spatial") {
# 4. Perform PCA
seurat_obj <- RunPCA(
seurat_obj # Seurat object
, assay = "SCT" # use SCT assay
, verbose = FALSE # reduce output
)
# 5. Find neighbors and clusters
seurat_obj <- FindNeighbors(
seurat_obj # Seurat object
, reduction = "pca" # use PCA for neighbor finding
, dims = 1:30 # use first 30 PCs
)
# Finding Gene Clusters
seurat_obj <- FindClusters(
seurat_obj
#, resolution = 0.5
, verbose = FALSE
)
# 6. Run UMAP
seurat_obj <- RunUMAP(
seurat_obj # Seurat object
, reduction = "pca" # use PCA for UMAP
, dims = 1:30 # use first 10 PCs
, verbose = FALSE # reduce output
)
}
spatial_samples_sct_dimr <- sapply(spatial_samples_sct, dimreduce)
saveRDS(spatial_samples_sct_dimr, file.path(workDir, "/data/processed", "spatial_samples_sct_dimr.RDS"))
} else {
spatial_samples_sct <- readRDS(file.path(workDir, "/data/processed", "spatial_samples_sct_dimr.RDS"))
}3 Single Cell Reference Data Preparations
| Data Type | Single Cell RNA-Seq |
| DataSet | GSE149614 |
| Publication | Yan et al |
3.1 sc - Download Data
Code
if(!file.exists(file.path(workDir, "/data/processed", "scrna_seuratRaw.RDS")) & !file.exists(file.path(workDir, "/data/processed", "hcc_scrna_reference.RDS")) ) {
geo_accession="GSE149614"
fetch_eset( geo_accession , save_dir=file.path(workDir,"data/raw/single_cell") )
#~~~~~~~~~~~~~~~~~~~~~~#
# Build Seurat Object
#~~~~~~~~~~~~~~~~~~~~~~#
# Set file paths
scdata_dir <- file.path(workDir,"data/raw/single_cell", "GSE149614")
count_file <- file.path(scdata_dir, "GSE149614_HCC.scRNAseq.S71915.count.txt.gz")
metadata_file <- file.path(scdata_dir, "GSE149614_HCC.metadata.updated.txt.gz")
# Get the header line (cell names)
header_line <- readLines(count_file, n = 1)
cell_names <- strsplit(header_line, "\t")[[1]]
# Read the data starting from line 2
message("Reading count matrix...")
counts <- fread(count_file, header = FALSE, skip = 1, data.table = FALSE)
# Set proper dimensions
gene_names <- counts[,1] # First column is gene names
counts <- as.matrix(counts[,-1]) # Remove gene column, convert to matrix
rownames(counts) <- gene_names
colnames(counts) <- cell_names
counts1 <- as.matrix(counts)
# Extract expression data and sample metadata
expression_data <- exprs(eset)
scSample_metadata <- pData(eset)
message("Processing scRNA-seq data for anchor-based integration...")
# 2. Read metadata
message("Reading metadata...")
metadata <- fread(metadata_file, header = TRUE, data.table = FALSE)
rownames(metadata) <- metadata$Cell # Match to cell names
# 3. Ensure cell names match
cat("Cell Names Match:", identical(colnames(counts), rownames(metadata) ))
# 4. Create Seurat object
message("Creating Seurat object...")
scrna_ref_raw <- CreateSeuratObject(
counts = counts,
meta.data = metadata,
project = "HCC_GSE149614",
min.cells = 3,
min.features = 200
)
saveRDS(scrna_ref, file.path(workDir, "/data/processed", "scrna_seuratRaw.RDS"))
} else {
scrna_ref <- readRDS(file.path(workDir, "/data/processed", "scrna_seuratRaw.RDS"))
}3.2 sc - Process Data
Here we SCTransform the single cell data to keep the noise model the same as the spatial data.
Code
if(!file.exists(file.path(workDir, "/data/processed", "hcc_scrna_reference.RDS")) ) {
# Standard scRNA-seq processing
Idents(scrna_ref) <- "celltype"
# Normalize Data
scrna_ref <- SCTransform (
scrna_ref
, ncells = 3000 # learns noise models on 3000 cells, whole dataset normalised with no loss in performance
, verbose = FALSE ) %>%
# PCA
RunPCA(verbose = FALSE) %>%
# UMAP
RunUMAP( dims = 1:30 )
# Save processed reference
saveRDS(scrna_ref, file = file.path(workDir, "/data/processed", "hcc_scrna_reference.RDS"))
} else {
scrna_ref <- readRDS(file.path(workDir, "/data/processed", "hcc_scrna_reference.RDS"))
}
message("scRNA-seq reference prepared for anchor-based integration")
table(scrna_ref$celltype)##
## B Endothelial Fibroblast Hepatocyte Myeloid T/NK
## 3685 3644 2266 20782 15947 25591